{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load the labelled training set (it carries the target column `y`) from the data directory.\n",
    "train = pd.read_csv('../data/train_xy.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the test set that is scored for the submission file at the end of the notebook.\n",
    "test = pd.read_csv('../data/test_all.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cust_id</th>\n",
       "      <th>cust_group</th>\n",
       "      <th>y</th>\n",
       "      <th>x_1</th>\n",
       "      <th>x_2</th>\n",
       "      <th>x_3</th>\n",
       "      <th>x_4</th>\n",
       "      <th>x_5</th>\n",
       "      <th>x_6</th>\n",
       "      <th>x_7</th>\n",
       "      <th>...</th>\n",
       "      <th>x_148</th>\n",
       "      <th>x_149</th>\n",
       "      <th>x_150</th>\n",
       "      <th>x_151</th>\n",
       "      <th>x_152</th>\n",
       "      <th>x_153</th>\n",
       "      <th>x_154</th>\n",
       "      <th>x_155</th>\n",
       "      <th>x_156</th>\n",
       "      <th>x_157</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>110000</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.354167</td>\n",
       "      <td>0.604988</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>-99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>110001</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.012058</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>110002</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.565979</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>110003</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.316209</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>110004</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.008061</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 160 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   cust_id cust_group  y       x_1       x_2  x_3  x_4  x_5  x_6  x_7  ...    \\\n",
       "0   110000    group_3  0  0.354167  0.604988  -99  -99  -99  -99  -99  ...     \n",
       "1   110001    group_3  0  0.125000  0.012058  -99  -99  -99  -99  -99  ...     \n",
       "2   110002    group_3  0  0.333333  0.565979    0    0    0    0    0  ...     \n",
       "3   110003    group_3  0  0.208333  0.316209    0    0    0    0    1  ...     \n",
       "4   110004    group_3  0  0.208333  0.008061  -99  -99  -99  -99  -99  ...     \n",
       "\n",
       "   x_148  x_149  x_150  x_151  x_152  x_153  x_154  x_155  x_156  x_157  \n",
       "0      1      1      1      1      1      1      1      1      3    -99  \n",
       "1      1      1      1      1      1      1      1      1      2      2  \n",
       "2      1      1      2      1      1      1      1      1      2      2  \n",
       "3      2      1      1      1      1      1      1      1      2      4  \n",
       "4      1      1      1      1      1      1      1      1      2      1  \n",
       "\n",
       "[5 rows x 160 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the training data.\n",
    "# NOTE(review): many features show -99 in this preview -- presumably a missing-value sentinel; confirm.\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15000, 160)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, columns) of the training frame.\n",
    "train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cust_id</th>\n",
       "      <th>cust_group</th>\n",
       "      <th>x_1</th>\n",
       "      <th>x_2</th>\n",
       "      <th>x_3</th>\n",
       "      <th>x_4</th>\n",
       "      <th>x_5</th>\n",
       "      <th>x_6</th>\n",
       "      <th>x_7</th>\n",
       "      <th>x_8</th>\n",
       "      <th>...</th>\n",
       "      <th>x_148</th>\n",
       "      <th>x_149</th>\n",
       "      <th>x_150</th>\n",
       "      <th>x_151</th>\n",
       "      <th>x_152</th>\n",
       "      <th>x_153</th>\n",
       "      <th>x_154</th>\n",
       "      <th>x_155</th>\n",
       "      <th>x_156</th>\n",
       "      <th>x_157</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>group_1</td>\n",
       "      <td>0.291667</td>\n",
       "      <td>0.555388</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0.270833</td>\n",
       "      <td>0.770302</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0.354167</td>\n",
       "      <td>0.440327</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>group_1</td>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.476509</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>group_1</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.955286</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 159 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   cust_id cust_group       x_1       x_2  x_3  x_4  x_5  x_6  x_7  x_8  \\\n",
       "0        1    group_1  0.291667  0.555388  -99  -99  -99  -99  -99  -99   \n",
       "1        2    group_3  0.270833  0.770302    0    0    0    0    1    1   \n",
       "2        3    group_3  0.354167  0.440327    0    0    0    0    4    3   \n",
       "3        4    group_1  0.208333  0.476509    0    0    0    0    1    1   \n",
       "4        5    group_1  0.125000  0.955286    0    0    0    0    2    1   \n",
       "\n",
       "   ...    x_148  x_149  x_150  x_151  x_152  x_153  x_154  x_155  x_156  x_157  \n",
       "0  ...        1      1      1      1      1      1      2      2      2      3  \n",
       "1  ...        1      1      1      1      1      1      2      2      1     10  \n",
       "2  ...        1      1      1      1      1      1      1      1      3      3  \n",
       "3  ...        1      1      1      1      1      1      1      1      1      4  \n",
       "4  ...        1      1      1      1      1      1      1      1      1     10  \n",
       "\n",
       "[5 rows x 159 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the test data for comparison with the training preview.\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Training feature matrix: every column except the group label, the target and the identifier.\n",
    "non_feature_columns = ['cust_group', 'y', 'cust_id']\n",
    "x_train = train.drop(columns=non_feature_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Test feature matrix: drop the group label and the identifier, keep every other column.\n",
    "x_test = test.drop(columns=['cust_group', 'cust_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15000, 157)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, feature columns) left in the training matrix after the drop above.\n",
    "x_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 157)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# (rows, feature columns) left in the test matrix after the drop above.\n",
    "x_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 157)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the train rows on top of the test rows so both sets are encoded together.\n",
    "# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the labels\n",
    "# 0..len(x_test)-1 appear twice, which makes any label-based lookup or alignment ambiguous.\n",
    "x = pd.concat([x_train, x_test], ignore_index=True)\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x2349720d9b0>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPkAAADuCAYAAAD7nKGzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAGE1JREFUeJzt3XmYFNW9xvHv6Z5hZlhlXwQtFAUV\nRRRjFBGuihgKlxi9KipkMYlGk3hjTOpeE4PJVSsxV2NyTW6iYlRc467lIxrAHRWXqGDYgqUi6rDO\nMDPM0t11/6hGWQamZ6a7T9Xp3+d55hkemO5+h2feOaeqT51SQRAghDBXQncAIURhScmFMJyUXAjD\nScmFMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmF\nMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmFMJyUXAjDScmFMJyU\nXAjDScmFMFyZ7gCiOCzH6wkMA/oA5UCXHT5v/XMaqM1+1AAbgWrftWs1xBZ5oOT+5GawHK8LcAhw\nAGGZ98p+bP1zz06+RCNQDXwE/BNYsvXDd+01nXxuUUBS8hiyHE8B+wFfAo7Mfh4DVGiKtImw8O8B\nbwLzfdderimL2IGUPCYsxzsEsIFJwBFAb62B2vYhMA/4O/B337WrNecpWVLyiLIcrxw4HjiVsNzD\n9CbqlABYTFj4R4HnfdeWH7wikZJHiOV4SWAKcCZhuaM+WnfUB8Ac4HbftVfoDmM6KXkEWI43EPg2\n8F1gqOY4xfYKcAdwr+/aG3WHMZGUXCPL8cYDFwNfI3z7qpQ1AY8Dv/dd+wXdYUwiJS8yy/G6AucS\nlnuM5jhR9QpwHfCI79oZ3WHiTkpeJNn3sS8ErgAGaI4TF8uAq4G7fddO6w4TV1LyArMcLwGcB1wF\nWHrTxNZy4L+RsneIlLyALMebBlwDHKw7iyHeAS72XftF3UHiREpeAJbjHQX8BjhGdxZD3Qlc7rv2\nZ7qDxIGUPI8sx+sOuMD3AKU5julqgCuBm2QKv3tS8jyxHO944BbkuLvY3iacwr+kO0hUSck7KXsJ\n53XAd3RnKWEB8EfgMt+1m3SHiRopeSdYjjcFuJl4rys3ydvAWb5rL9MdJEqk5B1gOV4FcCPhMlQR\nLfWE0/fbdQeJCil5O1mOtzfwADBOdxaxW3cC3/Ndu053EN2k5O1gOd6JwN1AX91ZRE5WEE7f39Id\nRCfZyDFHluP9CHgSKXic7Ae8bDneabqD6CQjeRuya87/BHxTdxbRYRngEt+1/6Q7iA5S8t3Ivj32\nGDBRdxaRF9f6rv1fukMUm5R8FyzH2wOYS7hJojDH7cAFvmundAcpFil5KyzH6ws8A4zVnUUUxFzg\njFI58y4l34HleAMINxyUK8fM9jow2XftTbqDFJqcXd+G5XhDgOeQgpeCccBTluP10B2k0KTkWZbj\nDSMs+CjdWUTRHAk8YTlele4ghSTTdcByvD7AQmB/3VmEFnOBk33XbtEdpBBKfiTPvg/+EFLwUjYF\nuC17+ynjlHzJgb8g74OLcAfd63SHKISSLrnleFcAM3XnEJFxmeV4l+gOkW8le0xuOd5ZwD3INk1i\ney3ARN+1F+oOki8lWfLsRovzgUrdWUQkrQYO8117re4g+VBy0/XsYpeHkYKLXRsK3JXdMz/2jPgm\n2mk2MFB3CBF5k4Ff6A6RDyU1Xbcc7yLCDf+EyEUATPVd+yndQTqjZEpuOd5I4E2gq+4sIlbWEx6f\nf6g7SEeVxHTdcrxy4C6k4KL9+hKupYitkig54c0GD9cdQsTWFMvxztEdoqOMn65bjncM4YUnpfIL\nTRRGNTDKd+2NuoO0l9E/+Nlp+s0Y/n
2KohhATJe9mv7Dfyly6ajIn29ajnes7hDtZex0PbsBxFLA\n+E0BRFEtBcb4rt2sO0iuTB7JXaTgIv9GAY7uEO1h5EhuOd5hhHt4ycUnohDqgOG+a6/THSQXpo7k\n1yEFF4XTHbhcd4hcGTeSW443FfB05xDGqwf28V27WneQtpg4kl+lO4AoCd2An+gOkQujRnLL8SYC\nz+rOIUrGFsLR/FPdQXbHtJH8x7oDiJJSRQzOtBszkluONwp4DznhJoqrEdjXd+01uoPsikkj+WVI\nwUXxVQKR3vzRiJHccryBwAdAhe4soiRVA8OiugrOlJH8EqTgQp8BwOm6Q+xK7EuevQPKRbpziJIX\n2Z/B2Jec8BY3fXWHECXvWMvxRugO0RoTSn627gBCZH1dd4DWxPrEW/aWs9WEa4mF0O0jwPJdO6M7\nyLbiPpJPQwouomMYMEl3iB3FveSx3VxPGOsU3QF2FNuSW47XE/iK7hxC7GCq7gA7im3JgdOQ+5mJ\n6NnPcrx9dYfYVpxLfrLuAELsQqRG8ziXPHa7ZoqSEamSx/IttOwVZ//UnUOIXWgE+viuvUV3EIjv\nSD5RdwAhdqMSOE53iK3iWnKZqouom6w7wFZSciEKIzI32IzdMbnleMOBVbpzCNGGzUAv37W1F0zr\nSK6UOkkptUwptVIpleteWXI8LuKgBxCJ98u1lVwplQRuIly1diBwjlLqwBweGplpkBBtOFR3ANA7\nkn8JWBkEwaogCJqBe4FTc3hcLr8IhIiCsboDgN6S70l4ad5Wq7N/1xYpuYiLkh/JW9tZdbcnKSzH\n2wMYVJg4QuRdyY/kqwmvv91qKNDW3tX7FS6OEHk32HK8PrpDlGl87UXAfkqp4cDHhNs4TW/jMcPz\n8cK1rz9K3dtzIYDuY6bQ84hT2fTiXdS9PZdE114A9D52BlX7HrHd44JUM5/e/VOCVAtkMnQdOZ49\nJpwLwJYP3mbTgtkE6Ra6DBpB36/8EJVIUr/sJWpeuItEVXf6n/4zklU9adn4CZuev4P+p/40H9+O\niLYhwAadAbSVPAiClFLqEmAukARmB0GwpI2H7dPZ121e61P39lwGzbgelSyn+v4rqdp3HAA9xp1G\nryN3s7NuspyBZ19DoksVQTrFp3f9hKp9DqfLkP1Z793AwLOvprzPnmx6YQ51786jx5gT2fzawww6\n/7fU//N56t97jp6Hn8ymF+5kjwnndfZbEfEwCFisM4DW98mDIHgyCIL9gyDYNwiCq3N4SKdH8pb1\nq6kYMopEeSUqkaRi2GgaVizM6bFKKRJdqgAIMinIpEEpMls2o5LllPcJzxtWWofSsPyl7IMSBOkW\nglQTKpGk8aPFJLv1/vxrhfG0n0OK27LWTv+Hdem3N40fLSa9pZZMSyNbVr1OunYdAJvffII1sy9h\n3ZO/I91Y1+rjg0yaNbd9n9V/OI9K61AqhowkUdWTIJOi6ZMVADQse+nz5+w1/hyq77+SRv8fdDtw\nIjUv30ev8bJrVQnRXnKdx+Qd0aOzT1Debxg9jzyD6vt+jiqvpMuA4ZBI0mPsVHodfTYoxaYX5rBx\n/i30m3rpTo9XiSRDvvEHMo11VD98Nc1rfbr0t+h/yk/YOP9mgnQLldZhkEgCUDV8LFXDw5Osde/O\no2rfcaTWr2bDaw+RqOxO7xO+Q6JcNrgxmPaSx20k73TJAXqMOZHBX7+RQef+mkRlD8p7DyHZrTcq\nkUSpBD3GTKH5k+W7fY5EZXcqhx3MllVvAlCx5wEMOvc3DJ5xA5XDDqK895Dtvj7T0kjd4nn0GGuz\n8fnb6Tv1UroMGkH9kmfz8S2J6JKSt1PPfDxJun4TAKnaahqWL6TrgRNJ1X1xArRh+ULK++298+Ma\nashkp/GZliYaP/gH5X2HbvecQaqF2lcfoPvY7feYrH31QXqOOwWVLCNoyd4XTyUIUk35+JZEdGkv\nec
lN1wHWPnINmS2bIZGkz+QLSVZ2Z90T/0PzZ6tAKcp6DaDPlPButKnN61n/1O8ZeOZVpOs2sM67\nAYIMBBm6jppA1xFfAqD2tYdoWPkaENDj0KlU7T3m89dLbV5P86cr2eOY8O22nl/6Kp/e+WMSld3o\nf/rP8vEtiegaoDtArC41tRxvM3IzBREvy3zXHqUzQGym65bjKaCb7hxCtJP22XKbJVdKXaKU6l2M\nMG3oTuvr3YWIMu0lzyXAIGCRUupNYDYwN9Azxy/X8Jol4yD1/sppyVfWlJPSHcUoKZKbwNaaIadj\ncqWUAk4EvgGMA+4Hbg2C4F+FjfcFy/EqgUhscWuqntTVnJl8/r2zkgsyI9THByYUUZjBxd2HzKrZ\n+a2aIsr5xJtSagxhyU8CFgBfBp4JguAnhYu3Pcvx0sToPEKcJcikJybeXjIj+fTGoxLvDa1ULZHY\nyiiGfGbV5OXCqo5qc7qulPoBMBNYB9wCXB4EQYtSKgGsAIpWcqABObteFBkSyQWZsYcsyISr9Yar\nNR/OSD7jT0u+0q0fNQcrRRfNEeNC+/FPLsfk/YDTgyD4YNu/DIIgo5SaVphYu1SPlFyL94Mhe12V\nmrnXVamZdGPL5tOTL755dnJ+6gD14ciECvoX6nXTmYBxN9ezZ48ET0zvutO/37+khVnPNqEUjBmY\n4O6vhV9z0px6Xlmd5pi9yrZ73LkPNfDuZxmm7V/GNceHy4l/9VwThwxMcOqogpz2qS3Ek7ZHmyUP\nguDK3fxbsW9VVF/k1xOtqKeqx53pyV++Mz0ZCIKjE0uWzEzOXTch8e7grqp5/3y+1o2vNnNAvwS1\nrSwMXLE+zbUvNvHSN7vRu0pRXZ/5/N8uP7qChpaAP7/R8vnfvfNZOvx8UXcm3FZPTWNAQ0vAa2vS\n/HxiRT5jb2t9oZ44V9pP77eTlDxylHo5M/qglzOjARiq1q45L/nMylOTL1UNYuPBSnX89tKrazN4\nK1JcMaGC6xc27/TvN7/ZwsVHdKF3VfjO6oBuX5yuOX6fMp71t58plydgSwtkgoDmdEAyAVcuaOKX\nkwpWcJCSt5uUPOJWB/2HuKnpQ9zUdKpoapiWXLjo3OS8xoPVqv2SKmjXOu5Ln2rkNydUsrm59ZPD\ny9eHI/f42fWkMzBrUgUnjdj1j/QB/ZPs1SvBYX+u5/xDylm5IUMAjB2cbE+s9tK6KwzEr+SbdAcQ\nudtCRde/pScd8bf0JAAOV8uWzix7+rPjEm/170bjAUrtenHTE8tbGNBNcfiQ5E4j8lapDKzYkOHZ\nmV1ZXRsw4bZ6Fn+vO3tU7nrN1O9O+mJicfI9Dfx5WiVXP9/E25+lmbxPGd8+PO/nE2Ukbye5PVKM\nvRGMHPVGy8hRAAPZUD29bN7y0xMvlg9Va0crtf2S5Zc+TPPYshRPrthMYwpqmwLOe2gLc06v+vxr\nhvZUfHlokvKkYnhvxch+CVasz3DEnm2PzI8ubWHc4CT1zQGL16a5/8yuHHtbPeceUk7X8rwurNQ+\nksftPeeiLb4RhfUZfQbckDrzmAnNNx45sun2sh80X/zGoszI51NBYjXAtSdUsvpHPfAv7cG9Z1Rx\n3PCy7QoOcNqochb44cm0dQ0Zlq/PsE/vtgvakg648dVmLh/fhYaWL9ZKZwJoTuf3+6TtHYgBUErN\nVkpVK6Xyvh9c3EZyKbmBmimveCwz/vDHmscDcLBateLrZXPXnJB4o09PGg5im8HoygWNjBuS5JSR\n5UzZN8nT/0px4E11JBNw3eRK+nYNv3TCbfUsXZehrjlg6PWbufWUKqZkj9dvWtTMzDHhiH3IwAQB\ncPCf6pg6omy3U/0OynX2+Vfgf4E78h0gbpeajgbe1Z1DFE8fatafnXx26ZnJ5xKW+vQgpfKzcUgR\n9WVWTU5TdqWUBTwRBMHofAaIW8m7ImfYS1YZqZbjE28tmZF8uuaI
xFKri0prXROegw3Mqumb6xdL\nybMsx1sDDNadQ+g3Un34/ozk0x9+JbmoV282j1YqcoefLzOrZnyuX1yokkftPyUX/0JKLoBlwV7D\nr0hdMPyK1AXbXkEXjFAfHxCRK+iKvSK0VXEs+TLgGN0hRLTU0r3XrempR92anrr1Crp3InAFXSTO\nH8Wx5IuAb+kOIaJrN1fQde9HzegiXkH3Sq5fqJS6B5gE9FNKrQZ+EQTBrfkIEcdj8jHAP3TnEPHU\njS11X02+uPicwl9B1wT0ZFbNzovuiyyOJU8CNcimjqLTguCoxHvvfT05d+2ExDtD8nwF3UJm1Ryd\nx+frsNiVHMByvPnAv+nOIcyyJ2s/Ob/smRX5uIIOuJ5ZNZflLVwnxPGYHOA5pOQizz6m/2A3NX1w\nPq6gox3H44UW15I/qzuAMFtnrqDLeqngIXMU1+l6BeFlp3I7UFF0bV1BB7zDrJoxrT5Yg1iWHMBy\nvLmE20QLoU0XWpqmJBa9O6PsmYaxasU+ZSozFPg1s2oc3dm2iut0HeABpORCs2bKKx7PHD3u8ebw\nRPpotWrlyclXHv2u5lzbitv15Nt6mAhsdyvEthYH++xxbWr6a7pzbCu2Jfddex3hTR6EiJLHfNfO\n/9YTnRDbkmfdrzuAEDt4WHeAHcW95A8hU3YRHRuBv+sOsaNYl9x37Q3APN05hMi6w3ftRt0hdhTr\nkmfJlF1ExZ91B2iNCSV/ANisO4QoeS/4rh2JTSJ2FPuS+65dC+TlulshOiGSozgYUPKsG4FIvW0h\nSsp6whllJBlRct+1feAR3TlEyfqr79qt3Hc1Gowoedb1ugOIkhQAf9EdYneMKbnv2i8Dr+rOIUrO\nQ75rL9cdYneMKXnWDboDiJKSAa7UHaItppX8QeB93SFEybjbd+33dIdoi1El9107Bfyn7hyiJKSA\nWbpD5MKokgP4rn0fEdpfSxjrNt+1Y3GXXeNKnvUj3QGE0ZqAX+kOkSsjS+679kJkTbsonL/4rv2R\n7hC5MrLkWT8l/I0rRD6tA36pO0R7GFvy7Cq43+vOIYzzH9ldiWLD2JJnXQ18pjuEMMZTvmvP0R2i\nvYwuue/aNcD3dOcQRqgHLtQdoiOMLjmA79oPAffqziFi7wrftT/QHaIjjC951iVAte4QIrZeBf6g\nO0RHlUTJfddeD3xLdw4RSy3ABb5rZ3QH6aiSKDmA79pPAH/UnUPEjuO79mLdITqjZEqe9WMg8hcU\niMh4xHft2O9TUFIl9117C3AWUKc7i4i8VcA3dIfIh5IqOUB26jWd8FpgIVqzBTjDd+1NuoPkQ8mV\nHMB37ccJl70K0Zpv+a79lu4Q+VKSJQfwXfu3wGzdOUTkXOe79j26Q+RTyZY860LgOd0hRGQ8Bji6\nQ+SbCoJAdwatLMfrS7jYYV/dWYRW8wA7ylsrd1Spj+RbF8pMI7yEUJSmhcCpJhYcpOQA+K69FDgO\nWKs7iyi6t4CpvmvX6w5SKCU/Xd+W5XgHAfOBAbqziKJYChzru7bRv9xlJN+G79pLgEnAp5qjiMLz\ngRNMLzhIyXeSvf3sJOATzVFE4SwHjvNd+2PdQYpBSt4K37WXERa9JH4ISszLwNG+a5fMTTik5LuQ\nvb/VsUAkbywvOuRB4PjsOyolQ0q+G75rrwK+DHi6s4hOuwH4d9+1G3UHKTY5u54Dy/ESwDXIevc4\nyhDusFqyO/dKydvBcrzpwC1Ale4sIiebgZm+az+sO4hOUvJ2shxvHPAIsKfuLGK3XgOmx+V+ZYUk\nx+Tt5Lv268A4YIHuLKJVAfBr4BgpeEhG8g6yHE8B/0F4rF6hOY4IfQKc77v2PN1BokRK3knZpbB3\nAmN1ZylxTwDfiNstjIpBSp4HluOVEZ55/zkyqhdbDeGOqv+nO0hUScnzyHK8UYRn38frzlIi7gIu\n811b7ne3G1LyPMseq59DeLNF
S28aYy0BfuC79nzdQeJASl4gluN1AS4Gfgb00RzHFGuBK4GbfddO\n6w4TF1LyArMcrxfhvmE/RBbRdNRmwrvfXJu9U61oByl5kViONxT4JTADSGqOExfVwO+Bm0zZA10H\nKXmRWY63F3AR8G2gr+Y4UeUDvwVmZ+96IzpBSq6J5XiVwLnA94ExmuNExbuEq9Xu8107pTuMKaTk\nEWA53rGEZf8qpTeVXwfcD8zxXXuh7jAmkpJHiOV4gwmLfjowESjTm6hgtgCPA3OAp3zXbtGcx2hS\n8ojK3vThFMLCTyb+K+kagReAe4AHfdeu1ZynZEjJY8ByvB6Anf2YAOytN1FOWoBFhFtczwcWluKu\nLFEgJY8hy/GGEZb9KOAIwhN3lVpDQQPhibPnCUv9ou/ach/4CJCSG8ByvHJgNHAw4VJai3C0t4Bh\nQHkeX24tsAJYmf28mLDc7/uuLfd8jyApueGy+9MNISz8YMJVdxWEI3/lNn+uIDzRVwNsyn5s3OHz\nBhmd40dKLoThZPsnIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJy\nIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQwn\nJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQwnJRfCcFJyIQz3/zbtcAuj3nkLAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Share of each target value, drawn as a pie chart labelled with percentages.\n",
    "class_counts = train['y'].value_counts()\n",
    "class_counts.plot.pie(autopct='%1.2f%%')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Target column for all training rows, in the same row order as `train`.\n",
    "# NOTE: this name is rebound by the train/validation split cell further down.\n",
    "y_train = train['y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One-hot encode features x_96 .. x_157 and append the indicator columns to `x`;\n",
    "# the original columns are kept alongside their indicators.\n",
    "# NOTE(review): presumably x_96..x_157 are the categorical features -- confirm against the data dictionary.\n",
    "FIRST_ENCODED, LAST_ENCODED = 96, 157\n",
    "\n",
    "# prefix_sep='' keeps the existing naming scheme (column name immediately followed by the\n",
    "# level, e.g. 'x_157' with level 10 becomes 'x_15710') without a lambda whose parameter\n",
    "# shadowed the frame `x`.\n",
    "dummy_frames = [\n",
    "    pd.get_dummies(x['x_' + str(i)], prefix='x_' + str(i), prefix_sep='')\n",
    "    for i in range(FIRST_ENCODED, LAST_ENCODED + 1)\n",
    "]\n",
    "\n",
    "# Concatenate once instead of once per feature: every pd.concat copies the whole frame,\n",
    "# so growing `x` inside the loop re-copied it for each encoded column.\n",
    "# NOTE: re-running this cell appends the indicator columns again; re-run from the cell that builds `x`.\n",
    "x = pd.concat([x] + dummy_frames, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_1</th>\n",
       "      <th>x_2</th>\n",
       "      <th>x_3</th>\n",
       "      <th>x_4</th>\n",
       "      <th>x_5</th>\n",
       "      <th>x_6</th>\n",
       "      <th>x_7</th>\n",
       "      <th>x_8</th>\n",
       "      <th>x_9</th>\n",
       "      <th>x_10</th>\n",
       "      <th>...</th>\n",
       "      <th>x_1561</th>\n",
       "      <th>x_1562</th>\n",
       "      <th>x_1563</th>\n",
       "      <th>x_157-99</th>\n",
       "      <th>x_1571</th>\n",
       "      <th>x_1572</th>\n",
       "      <th>x_1573</th>\n",
       "      <th>x_1574</th>\n",
       "      <th>x_15710</th>\n",
       "      <th>x_15711</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.354167</td>\n",
       "      <td>0.604988</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.012058</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.565979</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.316209</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.008061</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 355 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        x_1       x_2  x_3  x_4  x_5  x_6  x_7  x_8  x_9  x_10   ...     \\\n",
       "0  0.354167  0.604988  -99  -99  -99  -99  -99  -99  -99   -99   ...      \n",
       "1  0.125000  0.012058  -99  -99  -99  -99  -99  -99  -99   -99   ...      \n",
       "2  0.333333  0.565979    0    0    0    0    0    0    0     0   ...      \n",
       "3  0.208333  0.316209    0    0    0    0    1    1    0     0   ...      \n",
       "4  0.208333  0.008061  -99  -99  -99  -99  -99  -99    0     1   ...      \n",
       "\n",
       "   x_1561  x_1562  x_1563  x_157-99  x_1571  x_1572  x_1573  x_1574  x_15710  \\\n",
       "0       0       0       1         1       0       0       0       0        0   \n",
       "1       0       1       0         0       0       1       0       0        0   \n",
       "2       0       1       0         0       0       1       0       0        0   \n",
       "3       0       1       0         0       0       0       0       1        0   \n",
       "4       0       1       0         0       1       0       0       0        0   \n",
       "\n",
       "   x_15711  \n",
       "0        0  \n",
       "1        0  \n",
       "2        0  \n",
       "3        0  \n",
       "4        0  \n",
       "\n",
       "[5 rows x 355 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the widened frame: the original features followed by the one-hot indicator columns.\n",
    "x.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(15000, 355)\n",
      "(10000, 355)\n"
     ]
    }
   ],
   "source": [
    "# Split the stacked frame back into its train and test parts. The boundary comes from the\n",
    "# data instead of hard-coded row counts, and .iloc makes the slicing explicitly positional\n",
    "# (train rows were stacked first, test rows after them).\n",
    "n_train = len(train)\n",
    "train_X = x.iloc[:n_train]\n",
    "test_X = x.iloc[n_train:]\n",
    "\n",
    "# Guard against the stacked frame and the source files drifting apart.\n",
    "assert len(test_X) == len(test), 'encoded test rows do not match the test file'\n",
    "print(train_X.shape)\n",
    "print(test_X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn import metrics\n",
    "from sklearn.model_selection import train_test_split\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Hold out 20% of the training rows for validation (fixed seed for a repeatable split).\n",
    "# The target is read from `train` directly rather than from `y_train`: this cell rebinds\n",
    "# `y_train` to the 80% subset, so feeding `y_train` back in made a second run of the cell\n",
    "# fail with mismatched lengths.\n",
    "X_train, X_val, y_train, y_val = train_test_split(\n",
    "    train_X, train['y'], test_size=0.2, random_state=2\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Fit a logistic-regression model with library-default settings on the training split.\n",
    "cls = LogisticRegression()\n",
    "cls.fit(X_train, y_train);  # fit() returns the estimator; ';' keeps the cell output empty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8064204809197741\n"
     ]
    }
   ],
   "source": [
    "# Score the held-out rows; column 1 of predict_proba is the probability of cls.classes_[1].\n",
    "predictions = cls.predict_proba(X_val)\n",
    "pre = predictions[:, 1]\n",
    "\n",
    "# AUC on the validation set\n",
    "val_auc = metrics.roc_auc_score(y_true=y_val, y_score=pre)\n",
    "print(val_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000,)"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Score the test rows with `cls`, the model fitted and validated in the cells above.\n",
    "# The original cell called `gbm`, a name no cell in this notebook defines, so a fresh\n",
    "# Restart & Run All stopped here with a NameError.\n",
    "# NOTE(review): if a gradient-boosted model was intended (XGBClassifier is imported), it has\n",
    "# to be trained in its own cell first -- confirm which model the submission should use.\n",
    "preds = cls.predict_proba(test_X)\n",
    "pred = preds[:, 1]\n",
    "pred.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Pair each test customer id with its predicted probability and write the submission file\n",
    "# (without the index column).\n",
    "submission_columns = {'cust_id': test['cust_id'], 'pred_prob': pred}\n",
    "Submission = pd.DataFrame(submission_columns)\n",
    "Submission.to_csv('Submission.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
